Setting the options for knitr…
library(knitr)
knitr::opts_chunk$set(results = 'asis', # Can also be set at the chunk-level
echo = TRUE,
comment = NA,
prompt = FALSE,
warning = FALSE,
message = FALSE,
fig.align = "center",
fig.width = 8.125,
out.width = "100%",
fig.path = "figures_umap/",
dev = c('png', 'tiff'),
dev.args = list(png = list(type = "cairo"), tiff = list(compression = 'lzw')),
cache = FALSE,
cache.lazy = FALSE,
autodep = TRUE)
options(width = 1000)
options(knitr.kable.NA = '')Loading libraries…
library(dplyr)
library(tidyverse) # The tidy verse
library(gridExtra)
library(uwot)
#require(devtools)
#install_version("clues", version = "0.6.2.2", repos = "http://cran.us.r-project.org")
library(clues)
library(parallel)
library(plotly)Cleaning the memory…
rm(list = ls(all.names = TRUE))Loading the dataset…
filename <- "Datasets/dataset_processed.txt"
df <- read.table(filename, header = TRUE, sep = "\t", dec = ".", quote ="", stringsAsFactors = T) %>% as_tibble()The sets of predictors we consider are:
We define the various sets of predictors…
We create a (named) list of these sets…
We define a function standardize() to standardize the values of a continuous variable (i.e., compute the z-score of)…
standardize <- function(x, na.rm = FALSE) {
if (sd(x, na.rm) != 0)
return ((x - mean(x, na.rm = na.rm)) / sd(x, na.rm))
else
return (x - mean(x, na.rm = na.rm))
}We apply this function to all our target numerical variables…
df <- df %>% mutate_at(c(Bioacoustic_set, DCT_set, MFCC_set), standardize, na.rm = TRUE)We define the function get_umap() which takes as input a
data frame (or tibble) which contains each raw initial features from the
three sets (Bioacoustic, DCT,
MFCC) and target labels (individual or
type). The target parameter y that specifies
the supervised target information passes this target column to the UMAP
model during fitting and it will make use of it to perform a supervised
dimension reduction (S-UMAP).
The umap() function in the uwot package
includes several others parameters to build a low-dimensional graph. The
four main parameters are n_neighbors which determines the
number of neighbors used in constructing the nearest neighbor graph,
min_dist which determines the allowed separation between
connected embeddings, n_components which is the
dimensionality of the embedding space, and metric which
defines the distance metric (e.g., Euclidean, cosine) used to define the
distances between points in the high-dimensional data space. We used the
default settings for each, i.e., 100 neighbors, a minimal distance of
0.01, two dimensions and the Euclidean distance metric.
In order to estimate the quality of the partitioning provided by UMAP
and thus quantify the degree of clustering of call types and individual
signatures, we computed the values of the silhouette coefficients
(Rousseeuw, 1987). We used function get_Silhouette in the
clues package.
UMAP is a stochastic algorithm. This means that different runs of
UMAP may have variations. To ensure that results can be reproduced, the
umap() function supports to set a random seed state.
get_umap_and_silhouette <- function(df_umap, target_DV, target_algo, target_set, n_neighbors = 15, metric = "euclidean", min_dist = 0.01, size = 0.5, alpha = 0.5) {
set.seed(123)
df_umap <- df_umap %>%
filter(algo == target_algo, set == target_set) %>%
dplyr::select(-algo, -set)
if (target_DV == "individual")
other_V <- "type"
if (target_DV == "type")
other_V <- "individual"
m <- df_umap %>%
dplyr::select(-id, -individual, -type)
umap_coords <- m %>%
umap(n_neighbors = n_neighbors, n_components = 2, metric = "euclidean", min_dist = 0.01, n_threads = detectCores()-2, y= df_umap[[target_DV]]) %>%
data.frame()
umap_coords[[target_DV]] <- df_umap[[target_DV]]
umap_coords[[other_V]] <- df_umap[[other_V]]
p_umap <- umap_coords %>% ggplot(aes(x = X1, y = X2, col = .data[[target_DV]])) +
geom_point(size = size, alpha = alpha) +
scale_color_brewer(palette = "Paired") +
guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1.0))) +
ggtitle(paste0("UMAP - ", target_algo, "\n", target_set))
p_umap2 <- umap_coords %>% ggplot(aes(x = X1, y = X2, col = .data[[target_DV]])) +
geom_point(size = 1, alpha = 0.5) +
scale_color_brewer(palette = "Paired") +
guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1.0))) +
ggtitle(paste0("UMAP - ", target_algo, "\n", target_set)) +
facet_wrap(vars(.data[[other_V]]))
silhouette <- get_Silhouette(as.matrix(umap_coords[, 1:2]), umap_coords[[target_DV]], disMethod = "Euclidean")
silhouette.df <- data.frame(sil = silhouette$s, observed = umap_coords[[target_DV]])
silhouette.df <- silhouette.df[order(silhouette.df$observed, -silhouette.df$sil), ]
silhouette.df$name <- factor(rownames(silhouette.df), levels = rownames(silhouette.df))
means.per.group <- silhouette.df %>%
mutate(name= as.integer(name)) %>%
group_by(observed) %>%
summarise(mean.group=round(mean(sil), 2), mean.name= mean(name), min = name[which.min(name)], max = name[which.max(name)])
sil_plot <- silhouette.df %>%
ggplot(., aes(x = name, y = sil, color = observed, fill = observed)) +
geom_col() +
labs(y = "Silhouette value",
x = "",
fill="Cluster",
color="Cluster",
title = paste0("Silhouette plot - ", target_algo, "\n", target_set),
subtitle = paste0("Average silhouette value: ", round(mean(silhouette.df$sil), 2)) ) +
scale_color_brewer(palette = "Paired") +
scale_fill_brewer(palette = "Paired") +
ggplot2::ylim(c(-1, 1)) +
#geom_hline(yintercept = mean(silhouette.df$sil), linetype = "dashed", color = "red", size=0.75) +
geom_text(data=means.per.group, aes(x=mean.name, y=mean.group, label= mean.group), color="black", hjust = -0.05) +
geom_segment(data = means.per.group, aes(x = min - 0.5, xend = max + 0.5, y = mean.group, yend = mean.group), size=0.75, linetype = "dashed", color="black") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
coord_flip() +
guides(fill="none", color="none")
return (list(umap_coords = umap_coords, umap = p_umap, umap2 = p_umap2, silhouette = silhouette, silhouette_plot = sil_plot))
}target_columns <- c("id", "individual", "type", Bioacoustic_set, DCT_set, MFCC_set)
df_umap <- df %>%
select(one_of(target_columns)) %>%
mutate(algo = "Raw features", set = "Bioacoustic, DCT, MFCC")
initial_data_type <- df_umap %>% get_umap_and_silhouette("type", "Raw features", "Bioacoustic, DCT, MFCC", size = 1, n_neighbors = 100)initial_data_type$umapinitial_data_type$umap2initial_data_type$silhouette_plotdescriptors <- paste0(df %>% pull(individual), ", ", df %>% pull(type), ", seq: ", df %>% pull(sequence), ", voc: ", df %>% pull(vocalization))
initial_data_type$umap_coords %>% plot_ly(x = ~X1, y = ~X2, color = ~type, colors = "Paired", type = 'scatter', mode = 'markers') %>%
layout(
plot_bgcolor = "#ffffff",
legend=list(title=list(text='Type')),
xaxis = list(
title = "0"),
yaxis = list(
title = "1")) %>%
add_markers(text = descriptors)target_columns <- c("id", "individual", "type", Bioacoustic_set, DCT_set, MFCC_set)
df_umap <- df %>%
dplyr::select(one_of(target_columns)) %>%
mutate(algo = "Raw features", set = "Bioacoustic, DCT, MFCC")
initial_data_ind <- df_umap %>% get_umap_and_silhouette("individual", "Raw features", "Bioacoustic, DCT, MFCC", size = 1, n_neighbors = 100)cols = c("#C84248", "#8569D5", "#5E738F", "#D1A33D", "#8A7C64", "#599861", "#89C5DA", "#DA5724", "#74D944", "#CE50CA")
initial_data_ind$umap <- initial_data_ind$umap +
scale_color_manual(values=cols)
initial_data_ind$umapinitial_data_ind$umap2 <- initial_data_ind$umap2 +
scale_color_manual(values=cols)
initial_data_ind$umap2initial_data_ind$silhouette_plot <- initial_data_ind$silhouette_plot +
scale_color_manual(values=cols)
initial_data_ind$silhouette_plotdescriptors <- paste0(df %>% pull(individual), ", ", df %>% pull(type), ", seq: ", df %>% pull(sequence), ", voc: ", df %>% pull(vocalization))
initial_data_ind$umap_coords %>% plot_ly(x = ~X1, y = ~X2, color = ~individual, colors = c("#C84248", "#8569D5", "#5E738F", "#D1A33D", "#8A7C64", "#599861", "#89C5DA", "#DA5724", "#74D944", "#CE50CA"), type = 'scatter', mode = 'markers') %>%
layout(
plot_bgcolor = "#ffffff",
legend=list(title=list(text='Individual')),
xaxis = list(
title = "0"),
yaxis = list(
title = "1")) %>%
add_markers(text = descriptors)grid.arrange(
initial_data_type$umap, initial_data_ind$umap,
initial_data_type$silhouette_plot, initial_data_ind$silhouette_plot,
nrow=2, ncol = 2,
heights=c(0.5,1)
)R version 4.1.3 (2022-03-10)
Platform: x86_64-w64-mingw32/x64
(64-bit)
OS version: Windows 10 x64 (build 22000)
parallel: Support for Parallel computation in R
version 4.1.3stats: The R Stats Package
version 4.1.3graphics: The R Graphics
Package version 4.1.3grDevices: The R
Graphics Devices and Support for Colours and Fonts version
4.1.3utils: The R Utils Package version
4.1.3datasets: The R Datasets Package
version 4.1.3methods: Formal Methods and
Classes version 4.1.3base: The R Base
Package version 4.1.3plotly: Create
Interactive Web Graphics via ‘plotly.js’ version
4.10.0clues: Clustering Method Based on
Local version 0.6.2.2uwot: The Uniform
Manifold Approximation and Projection (UMAP) Method for Dimensionality
Reduction version 0.1.11Matrix: Sparse and
Dense Matrix Classes and Methods version
1.4-1gridExtra: Miscellaneous Functions for
“Grid” Graphics version 2.3forcats: Tools
for Working with Categorical Variables (Factors) version
0.5.1stringr: Simple, Consistent Wrappers for
Common String Operations version 1.4.0purrr:
Functional Programming Tools version
0.3.4readr: Read Rectangular Text Data
version 2.1.2tidyr: Tidy Messy Data version
1.2.0tibble: Simple Data Frames version
3.1.6ggplot2: Create Elegant Data Visualisations
Using the Grammar of Graphics version
3.3.5tidyverse: Easily Install and Load the
‘Tidyverse’ version 1.3.1dplyr: A Grammar of
Data Manipulation version 1.0.8knitr: A
General-Purpose Package for Dynamic Report Generation in R version
1.37